Read & tidy the data

Load the required packages

library(tidyverse)
library(cowplot)
library(maps)
library(ggmap)
library(viridis)
library(plotly)
library(caret) #confusion matrix
library(e1071)
library(class) #knn.cv

Import the data

breweries = read_csv('https://raw.githubusercontent.com/BivinSadler/MSDS_6306_Doing-Data-Science/Master/Unit%208%20and%209%20Case%20Study%201/Breweries.csv')
beer = read_csv('https://raw.githubusercontent.com/BivinSadler/MSDS_6306_Doing-Data-Science/Master/Unit%208%20and%209%20Case%20Study%201/Beers.csv')

Breweries data has 558 observations and four columns:

  1. Brew_ID: Unique identifier of the brewery
  2. Name: Name of the brewery
  3. City: City where brewery is located
  4. State: U.S. State where brewery is located
dim(breweries)
## [1] 558   4
spec(breweries)
## cols(
##   Brew_ID = col_double(),
##   Name = col_character(),
##   City = col_character(),
##   State = col_character()
## )

Beer data has 2410 observations and seven columns:

  1. Name: Name of the beer
  2. Beer_ID: Unique identifier of the beer
  3. ABV: Alcohol by volume of the beer
  4. IBU; International Bitterness Units of the beer
  5. Brewery_id: Brewery identifier associated with the beer
  6. Style: Style of the beer
  7. Ounces: Ounces of the beer
dim(beer)
## [1] 2410    7
spec(beer)
## cols(
##   Name = col_character(),
##   Beer_ID = col_double(),
##   ABV = col_double(),
##   IBU = col_double(),
##   Brewery_id = col_double(),
##   Style = col_character(),
##   Ounces = col_double()
## )

Merge datasets by primary key

df = merge(breweries, beer, by.x = 'Brew_ID', by.y = 'Brewery_id')
head(df)

Rename Name.x and Name.y

df = rename(df, Brewery = Name.x, Beer = Name.y)
head(df)

Clean the data:

  1. Check the sum of NA values in each column (ABV: 62, IBU: 1005, Style: 5)
  2. Create new dataframe excluding the NA values (but also keep original dataframe)
  3. Confirm there are zero NA values left
cbind(lapply(lapply(df, is.na), sum))
##         [,1]
## Brew_ID 0   
## Brewery 0   
## City    0   
## State   0   
## Beer    0   
## Beer_ID 0   
## ABV     62  
## IBU     1005
## Style   5   
## Ounces  0
final = df %>% na.exclude()
sum(as.numeric(is.na.data.frame(final)))
## [1] 0

Study the dataset

Final dataset has 1403 observations and 10 columns:
1. Brew_ID: Unique identifier of the brewery
2. Brewery: Name of the brewery
3. City: City where brewery is located
4. State: U.S. state where brewery is located
5. Beer: Name of the beer
6. Beer_ID: Unique identifier of the beer
7. ABV: Alcohol by volume of the beer
8. IBU: International Bitterness Units of the beer
9. Ounces: Ounces of the beer
10. Style: Style of the beer

dim(final)
## [1] 1403   10
str(final)
## 'data.frame':    1403 obs. of  10 variables:
##  $ Brew_ID: num  1 1 1 1 1 1 2 2 2 2 ...
##  $ Brewery: chr  "NorthGate Brewing" "NorthGate Brewing" "NorthGate Brewing" "NorthGate Brewing" ...
##  $ City   : chr  "Minneapolis" "Minneapolis" "Minneapolis" "Minneapolis" ...
##  $ State  : chr  "MN" "MN" "MN" "MN" ...
##  $ Beer   : chr  "Pumpion" "Stronghold" "Parapet ESB" "Get Together" ...
##  $ Beer_ID: num  2689 2688 2687 2692 2691 ...
##  $ ABV    : num  0.06 0.06 0.056 0.045 0.049 0.048 0.042 0.08 0.125 0.077 ...
##  $ IBU    : num  38 25 47 50 26 19 42 68 80 25 ...
##  $ Style  : chr  "Pumpkin Ale" "American Porter" "Extra Special / Strong Bitter (ESB)" "American IPA" ...
##  $ Ounces : num  16 16 16 16 16 16 16 16 16 16 ...
##  - attr(*, "na.action")= 'exclude' Named int [1:1007] 19 35 36 37 38 39 40 41 42 43 ...
##   ..- attr(*, "names")= chr [1:1007] "19" "35" "36" "37" ...

The first six and last rows of the dataframe

head(final)
tail(final)

Exploratory Data Analysis

There are 558 total breweries. I’ve plotted their distribution by state.

breweries %>% group_by(State) %>% tally(sort=T)

Plot into a heatmap

states = map_data('state')
st_abb = data.frame(abb = state.abb, region = tolower(state.name))
brew = breweries %>% group_by(State) %>% tally(sort=T)
brew = subset(brew, State != 'HI' & State != 'AK') 
brew = rename(brew, abb = State, count = n)
brew = merge(brew, st_abb, by='abb')
geo = merge(states, brew, by='region', all.x=T)
geo = geo[order(geo$order),]
center <- data.frame(region=tolower(state.name), long=state.center$x, lat=state.center$y)
center <- merge(center, brew, by="region", all.x = TRUE)
ggplot(geo, aes(x=long,y=lat)) +
geom_polygon(aes(group=group, fill=count)) +
geom_text(data=center, aes(long, lat, label=count)) +
scale_fill_gradient(low = "slategray1",
high = "royalblue4",
guide = "colorbar") +
ggtitle("Breweries per State") +
coord_map() + theme(axis.title = element_text(size = 20)) + theme_void() + theme(legend.position = c(0.9, 0.4), plot.title = element_text(size= 20, hjust=0.1, margin = margin(b = -0.1, t = 0.4, l = 2, unit = "cm")))

I computed median ABV and IBU for each state. Maine has the highest median ABV at 6.7% as well as the highest median IBU at 61.

median = final %>% select(State, ABV, IBU) %>% group_by(State) %>% summarize(med_abv = median(ABV), med_ibu = median(IBU))
arrange(median, -med_abv)
arrange(median, -med_ibu)

Kentucky has the max ABV at 12.5%

max(final$ABV)
## [1] 0.125
grep(.125, final$ABV)
## [1] 9
final$State[9]
## [1] "KY"

Oregon has the highest IBU at 138

max(final$IBU)
## [1] 138
grep(138, final$IBU)
## [1] 1132
final$State[1132]
## [1] "OR"

I plotted median ABV for each state.

median %>% mutate(perc = med_abv*100, State = fct_reorder(State, perc)) %>% ggplot(aes(State, perc)) + geom_point(size=4, color="royalblue2") + coord_flip() + labs(y='ABV (%)', title='Median Alcohol by Volume of each State', subtitle = 'Decreasing Order') + theme_minimal() + theme(plot.title = element_text(size= 20), axis.title = element_text(size = 20))

Median IBU for each state.

median %>% mutate(State = fct_reorder(State, med_ibu)) %>% ggplot(aes(State, med_ibu)) + geom_point(size=4, color="indianred2") + coord_flip() + labs(y='IBU', title='Median International Bitterness Unit of each State', subtitle = 'Decreasing Order') + theme_minimal() + theme(plot.title = element_text(size= 20), axis.title = element_text(size = 20))

Summary statistics and distribution of ABV

Statistic Percent
Min 2.7
Med 5.7
Mean 6
Max 12.5
IQR 1.8

Right skew distribution suggests lower production at increasing ABV. Consumer preference (and thus sales) drives demand to infer most lean towards lower concentration of ABV. The median value of 5.7% is a better indicator of the distribution because the mean is sensitive to outliers and gets pulled towards the skewed tail.

Here is the boxplot & density plot of ABV distribution.

summary(final$ABV)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02700 0.05000 0.05700 0.05992 0.06800 0.12500
fivenum(final$ABV)
## [1] 0.027 0.050 0.057 0.068 0.125
IQR(final$ABV)
## [1] 0.018
final %>% mutate(abvperc = ABV*100) %>% ggplot(aes(abvperc)) + geom_boxplot(fill='royalblue', alpha=.3) + labs(x='ABV (%)', title='Distribution of Alcohol by Volume') + theme_cowplot() + theme(plot.title = element_text(size= 20), axis.ticks.y=element_blank(), axis.title.y=element_blank(), axis.text.y=element_blank(), legend.position = 'none', axis.title = element_text(size = 20)) + geom_segment(aes(x = 6, y = -.05, xend = 6, yend = .05, col='red'))

final %>% mutate(abvperc = ABV*100) %>% ggplot(aes(abvperc)) + geom_density(fill='indianred2', color='indianred3', alpha=.9) + geom_vline(xintercept=5.7, col='cornflowerblue') + geom_vline(xintercept = 6, linetype='dotted', col='paleturquoise3', size=1.1) + labs(x='ABV (%)', title='Distribution of Alcohol by Volume') + theme_minimal_hgrid() + theme(axis.title = element_text(size = 20), plot.title = element_text(size= 20))

Relationship between ABV and IBU using linear regression model.

I performed a log transformation of the abv values due to its skewness and conducted a linear regression model. Just from the eye test, the regression line depicts a positive, linear relationship. In fact, with an R2 of .45, it is estimated that 45% of the variation in IBU is explained by abv. Furthermore, there is strong evidence to suggest abv and IBU are linearly correlated (p-val <.00001).The ABV values were log transformed before plotting.

final = final %>% mutate(log_abv = log(ABV))
cor.test(x=final$log_abv, y=final$IBU)
## 
##  Pearson's product-moment correlation
## 
## data:  final$log_abv and final$IBU
## t = 34.032, df = 1401, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6430317 0.7004025
## sample estimates:
##       cor 
## 0.6727271
model = lm(IBU~log_abv, data=final)
summary(model)
## 
## Call:
## lm(formula = IBU ~ log_abv, data = final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -74.393 -12.410  -0.622  12.989  91.579 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  272.589      6.773   40.24   <2e-16 ***
## log_abv       80.972      2.379   34.03   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.22 on 1401 degrees of freedom
## Multiple R-squared:  0.4526, Adjusted R-squared:  0.4522 
## F-statistic:  1158 on 1 and 1401 DF,  p-value: < 2.2e-16

ABV vs IBU

final %>% ggplot(aes(x=log_abv, y=IBU)) + geom_point(color="black", fill="#69b3a2", shape=22, alpha=0.5, size=3, stroke = 1) + geom_line(aes(x = log_abv,y = model$fit), color = "lightpink3", lwd=.8) + labs(title='Relationship between Bitterness of Beer and its Alcohol Content', subtitle = 'Linear Regression Model on Log Transformed ABV', x= 'ABV (logged)', y='IBU') + theme_classic() + theme(axis.title = element_text(size = 20), plot.title = element_text(size= 20))

ABV vs. IBU by Style

View all 90 different styles

final %>% group_by(Style) %>% tally(sort=T)

Style df of just IPA/ale styles. 32 different styles of IPA and ale encompassing 944 observations and 10 columns

style = final %>% group_by(Style) %>% filter(grepl('\\sAle|IPA$', Style))
dim(style)
## [1] 944  11

Distinguish ale from ipa

all_style = final %>% group_by(Style)

keep = grep('\\sAle', all_style$Style)
ale = all_style[keep,]
rem = grep('India', ale$Style)
ale = ale[-rem,] #552 obs, 27 ale styles
ale$Style = 'ale'

ipa = grep(('\\sIndia\\sPale\\sAle|IPA$'), all_style$Style)
ipa = all_style[ipa,] # 392 obs, 5 ipa styles
ipa$Style = 'ipa'

ipa_ale = rbind(ipa, ale)
ipa_ale$Style = as.factor(ipa_ale$Style)

Code the plots

pmain = ipa_ale %>% ggplot(aes(x=ABV, y=IBU, shape=factor(Style))) + geom_point(aes(color=factor(Style))) + theme(legend.position = 'none', axis.title = element_text(size = 20), panel.background = element_rect(fill = "gray97", color = NA))

xdens = axis_canvas(pmain, axis='x') + geom_density(data=ipa_ale, aes(ABV, fill=Style), alpha=.7, size=.2)
ydens = axis_canvas(pmain, axis='y', coord_flip = TRUE) + geom_density(data=ipa_ale, aes(IBU, fill=Style), alpha=.7, size=.2) + coord_flip()
p1 = insert_xaxis_grob(pmain, xdens, grid::unit(.2, 'null'), position='top')
p2 = insert_yaxis_grob(p1, ydens, grid::unit(.2, 'null'), position='right')

Plot scatterplot with marginal plot on the axis

ggdraw(p2) + draw_label('ABV vs. IBU by Style', x=.12, y=.98, size=20) + draw_label('IPA', x=.52, y=.95, size=13, color='paleturquoise4', fontface = 'bold') + draw_label('Ale', x=.35, y=.97, size=13, color='rosybrown', fontface = 'bold')

Conduct KNN classifier to investigate the difference with respect to ABV and IBU between IPAs and other types of Ale. I normalized both variables and performed a knn cross validation test with a k of 11. The model achieved a 78% accuracy (how well it correctly identified the style), almost 84% sensitivity (how many ales did the model correctly identify), and 70% specificity (how many ipa did the model correctly identify). Additionally, I log transformed abv and ibu conducted a Welch’s two sample t-test to conclude that there is strong evidence to suggest the true difference in both median abv and ibu value is different for ale and ipa.

normalize = function(x) {
return ((x - min(x)) / (max(x) - min(x)))
                        }
ipa_ale = ipa_ale %>% mutate(norm_abv = normalize(ABV), norm_ibu = normalize(IBU))

model = knn.cv(ipa_ale[,c(12:13)], ipa_ale$Style, k=11)
confusionMatrix(model, ipa_ale$Style)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction ale ipa
##        ale 459 117
##        ipa  93 275
##                                           
##                Accuracy : 0.7775          
##                  95% CI : (0.7496, 0.8037)
##     No Information Rate : 0.5847          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.5378          
##                                           
##  Mcnemar's Test P-Value : 0.1125          
##                                           
##             Sensitivity : 0.8315          
##             Specificity : 0.7015          
##          Pos Pred Value : 0.7969          
##          Neg Pred Value : 0.7473          
##              Prevalence : 0.5847          
##          Detection Rate : 0.4862          
##    Detection Prevalence : 0.6102          
##       Balanced Accuracy : 0.7665          
##                                           
##        'Positive' Class : ale             
## 

Conduct t.test

ipa_ale = ipa_ale %>% mutate(logabv=log(ABV), logibu=log(IBU))
t.test(logabv~Style, data=ipa_ale)
## 
##  Welch Two Sample t-test
## 
## data:  logabv by Style
## t = -16.91, df = 849.95, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group ale and group ipa is not equal to 0
## 95 percent confidence interval:
##  -0.2260548 -0.1790364
## sample estimates:
## mean in group ale mean in group ipa 
##         -2.889941         -2.687395
t.test(logibu~Style, data=ipa_ale)
## 
##  Welch Two Sample t-test
## 
## data:  logibu by Style
## t = -30.993, df = 875.42, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group ale and group ipa is not equal to 0
## 95 percent confidence interval:
##  -0.8911237 -0.7849819
## sample estimates:
## mean in group ale mean in group ipa 
##          3.399569          4.237622

INSIGHT

Portland has the highest number of breweries with 39 and its max IBU is 103. Similarly, SD has the third highest number of breweries with 35 and its max IBU is 115. I’ve only given you two examples (due to time constraint), but there is a strong linear correlation between breweries per city and its maximum IBU (correlation coefficient of .79 and p-value of less than .00001). Additionally, it is estimated that 63% of the variation in max IBU is explained by its linear relationship with breweries per city. This leads me to infer cities with a high number of breweries have a more developed palate for crafty beers that have high IBU.

Why is this important? Boston is ranked 12th in breweries per city yet its maximum IBU is only 45. That’s 50% less than the max IBU compared to cities with similar demographics. In fact, if you take the max IBU of all the cities in MA and extract the median, Boston is still 30 IBU lower. I believe that Boston is an untapped market with a need for higher IBU beer production.

IBU distribution on US map (static plot)

states = map_data('state')
usabeer = final %>% select(City, State, ABV, IBU)
usabeer = subset(usabeer, State != 'HI' & State != 'AK')
usabeer$citystate = str_c(usabeer$City, ", ", usabeer$State)
usabeer = cbind(geocode(as.character(usabeer$citystate)), usabeer)
usabeer = usabeer %>% arrange(IBU) %>% mutate(cs=factor(citystate, unique(citystate)))

ggplot() + geom_polygon(data=states, aes(x=long, y=lat, group=group), fill='grey', alpha=.3) + geom_point(data=usabeer, aes(lon, y=lat, size=IBU, color=IBU, alpha=IBU), shape=20) + scale_size_continuous(name='IBU', range=c(1,10), breaks=c(1, 25, 50, 75, 100)) + scale_alpha_continuous(name='IBU', range=c(.1, .9), breaks=c(1, 25, 50, 75, 100)) + scale_color_viridis(option='magma', direction=-1, breaks=c(1, 25, 50, 75, 100), name='IBU') + theme_void() + coord_map() + borders('state') + guides( colour = guide_legend()) +
ggtitle("Distribution of International Bitterness Unit by Cities") +
theme(
legend.position = c(0.85, 0.25),
text = element_text(color = "#22211d"),
plot.background = element_rect(fill = "#f5f5f2", color = NA),
panel.background = element_rect(fill = "#f5f5f2", color = NA),
legend.background = element_rect(fill = "#f5f5f2", color = NA),
plot.title = element_text(size= 20, hjust=0.1, margin = margin(b = -.5, t = 0.4, l = 2, unit = "cm")),
)

Plot IBU distribution

summary(final$IBU)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4.00   21.00   35.00   42.74   64.00  138.00
usabeer %>% ggplot(aes(IBU)) + geom_density(fill='royalblue', alpha=.7) + geom_vline(xintercept=35, col='red') + geom_vline(xintercept = 42.74, linetype='dotted', col='turquoise', size=1.1) + labs(title='International Bitterness Unit') + theme_minimal_hgrid() + theme(axis.title = element_text(size = 20), plot.title = element_text(size= 20))

IBU distribution on US map (PLOTLY)

usabeer = usabeer %>% mutate( mytext=paste(citystate, "\n", "IBU: ", IBU, sep=""))

p.usa = ggplot() + geom_polygon(data=states, aes(x=long, y=lat, group=group)) + geom_point(data=usabeer, aes(x=lon, y=lat, size=IBU, color=IBU, alpha=IBU, text=mytext)) + scale_size_continuous(range=c(1,7)) + scale_color_viridis(option='inferno', direction = -1) + theme_void() + coord_map() + borders('state', fill='whitesmoke', alpha=.3) + labs(title='Distribution of International Bitterness Unit by Cities') + theme(legend.position = 'none', plot.title = element_text(size=25, hjust=.5))
p.usa = ggplotly(p.usa, tooltip='text')
p.usa

Number of breweries per city in descending order

nbrew = final %>% select(IBU, City, State) %>% group_by(City, State) %>% tally(sort=T)
nbrew

MAX IBU for each city

maxibu = final %>% select(IBU, City, State) %>% group_by(State, City) %>% slice(which.max(IBU))
maxibu = arrange(maxibu, -IBU)
maxibu

merge, conduct cor test

ibu.brew.cor = as.data.frame(cbind(maxibu$IBU, nbrew$n))
ibu.brew.cor = rename(ibu.brew.cor, max_ibu=V1, n_brew = V2)
cor(ibu.brew.cor) # .7939
##           max_ibu    n_brew
## max_ibu 1.0000000 0.7938968
## n_brew  0.7938968 1.0000000
cor.test(x=ibu.brew.cor$n_brew, y=ibu.brew.cor$max_ibu, method = 'pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  ibu.brew.cor$n_brew and ibu.brew.cor$max_ibu
## t = 22.196, df = 289, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7471146 0.8328525
## sample estimates:
##       cor 
## 0.7938968
ma = maxibu %>% select(State, IBU) %>% filter(State == 'MA') %>% summarize(value = IBU)
## Adding missing grouping variables: `City`
## `summarise()` has grouped output by 'State'. You can override using the `.groups` argument.
mean(ma$value)
## [1] 68.09091
fivenum(ma$value)
## [1]  35.0  45.0  69.0  82.5 130.0